# -*- coding: utf-8 -*-
"""
Functions that create the similarity statistics used by the decision tree to match legislative sections.
Created on 10/11/21
Last updated: Apr 21 2023

Author: Karen Simpson and Jeremy Gelman

"""

import difflib
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import textdistance
from nltk.tokenize import word_tokenize  

def generate_ngrams(words_list, n):
    """Return every run of ``n`` consecutive words as a space-joined string.

    Parameters
    ----------
    words_list : list of str
        Tokenised text, one word per element.
    n : int
        Number of words per n-gram.

    Returns
    -------
    list of str
        The n-grams in order of appearance (duplicates are kept). The list
        is empty when ``n`` is less than 1 or when ``words_list`` holds
        fewer than ``n`` words.
    """
    if n < 1:
        return []
    # Only start positions with at least n words remaining produce a full
    # n-gram; this is the same window rule gen_hash applies
    # (len(words) - start >= n).
    last_start = len(words_list) - n
    return [
        ' '.join(words_list[start:start + n])
        for start in range(last_start + 1)
    ]

def gen_hash(txt1, txt2, n, name1, name2):
    """Compare two texts by the distinct word n-grams they have in common.

    Both texts are split on whitespace and reduced to their sets of
    distinct n-grams (``n`` consecutive words joined by a single space).

    Parameters
    ----------
    txt1, txt2 : str
        The texts to compare.
    n : int
        Number of words per n-gram.
    name1, name2 : object
        Labels for the two texts. They have no effect on the returned
        statistics.

    Returns
    -------
    list
        ``[total1, total2, shared, share1, share2, scope]`` where
        ``total1``/``total2`` are the counts of distinct n-grams in each
        text, ``shared`` is the count present in both, ``share1`` and
        ``share2`` are ``shared`` divided by the respective total, and
        ``scope`` is ``share1 / share2``. The last three entries are ``0``
        when no n-gram is shared.
    """
    words_a = txt1.split()
    words_b = txt2.split()
    size_a = len(words_a)
    size_b = len(words_b)

    # A start position contributes only if a full n-word window fits.
    grams_a = {
        ' '.join(words_a[start:start + n])
        for start in range(size_a)
        if size_a - start >= n
    }
    grams_b = {
        ' '.join(words_b[start:start + n])
        for start in range(size_b)
        if size_b - start >= n
    }

    total_a = len(grams_a)
    total_b = len(grams_b)
    shared = len(grams_a & grams_b)

    if shared:
        share_a = shared / total_a
        share_b = shared / total_b
        scope = share_a / share_b
    else:
        share_a = share_b = scope = 0

    return [total_a, total_b, shared, share_a, share_b, scope]

def gen_hash_first100(txt1, txt2, n, name1, name2):
    """Compare the first 100 words of two texts by their shared n-grams.

    Each text is split on whitespace and cut to its first 100 words before
    being reduced to its set of distinct n-grams (``n`` consecutive words
    joined by a single space).

    Parameters
    ----------
    txt1, txt2 : str
        The texts to compare.
    n : int
        Number of words per n-gram.
    name1, name2 : object
        Labels for the two texts. They have no effect on the returned
        statistics.

    Returns
    -------
    list
        ``[total1, total2, shared, share1, share2, scope]`` where
        ``total1``/``total2`` are the counts of distinct n-grams in each
        truncated text, ``shared`` is the count present in both,
        ``share1`` and ``share2`` are ``shared`` divided by the respective
        total, and ``scope`` is ``share1 / share2``. The last three entries
        are ``0`` when no n-gram is shared.
    """
    head_a = txt1.split()[:100]
    head_b = txt2.split()[:100]
    size_a = len(head_a)
    size_b = len(head_b)

    # A start position contributes only if a full n-word window fits.
    grams_a = {
        ' '.join(head_a[start:start + n])
        for start in range(size_a)
        if size_a - start >= n
    }
    grams_b = {
        ' '.join(head_b[start:start + n])
        for start in range(size_b)
        if size_b - start >= n
    }

    total_a = len(grams_a)
    total_b = len(grams_b)
    shared = len(grams_a & grams_b)

    if shared:
        share_a = shared / total_a
        share_b = shared / total_b
        scope = share_a / share_b
    else:
        share_a = share_b = scope = 0

    return [total_a, total_b, shared, share_a, share_b, scope]


def blocks(txt1, txt2):
    """Summarise the matching blocks difflib finds between two sequences.

    Parameters
    ----------
    txt1, txt2 : sequence
        The sequences to compare (e.g. strings, compared character by
        character).

    Returns
    -------
    list
        ``[longest, num_blocks, total_len, ave_len, por_txt1, por_txt2]``:

        * ``longest``    - size of the longest single matching block
        * ``num_blocks`` - number of entries from ``get_matching_blocks()``;
          this includes the zero-size terminal entry that call always
          appends, so it is never less than 1
        * ``total_len``  - summed size of all matching blocks
        * ``ave_len``    - ``total_len / num_blocks``
        * ``por_txt1``   - ``total_len / len(txt1)``; ``0.0`` if ``txt1``
          is empty
        * ``por_txt2``   - ``total_len / len(txt2)``; ``0.0`` if ``txt2``
          is empty
    """
    matcher = difflib.SequenceMatcher(None, txt1, txt2)
    longest = matcher.find_longest_match(0, len(txt1), 0, len(txt2)).size

    matching = matcher.get_matching_blocks()
    num_blocks = len(matching)
    total_len = sum(match.size for match in matching)
    # num_blocks >= 1 because of the terminal entry, so this cannot divide
    # by zero.
    ave_len = total_len / num_blocks

    # An empty input has nothing matched; report 0.0 rather than raising
    # ZeroDivisionError.
    len1 = len(txt1)
    len2 = len(txt2)
    por_txt1 = total_len / len1 if len1 else 0.0
    por_txt2 = total_len / len2 if len2 else 0.0

    return [longest, num_blocks, total_len, ave_len, por_txt1, por_txt2]


def dice(a, b):
    """Return the normalised Sorensen-Dice similarity of two texts.

    Both arguments are tokenised with ``word_tokenize`` and the resulting
    token lists are passed to
    ``textdistance.sorensen_dice.normalized_similarity``.
    """
    tokens_a = word_tokenize(a)
    tokens_b = word_tokenize(b)
    similarity = textdistance.sorensen_dice.normalized_similarity(
        tokens_a, tokens_b
    )
    return similarity
